{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "entire-merchant",
   "metadata": {},
   "outputs": [],
   "source": [
    "from IPython.core.interactiveshell import InteractiveShell\n",
    "InteractiveShell.ast_node_interactivity = 'all'  # default is ‘last_expr'\n",
    "\n",
    "%load_ext autoreload\n",
    "%autoreload 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "pressed-gateway",
   "metadata": {},
   "outputs": [],
   "source": [
    "import sys\n",
    "sys.path.append('/home/mink/notebooks/CameraTraps')  # append this repo to PYTHONPATH\n",
    "sys.path.append('/home/mink/lib/ai4eutils')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "constitutional-iceland",
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import os\n",
    "from collections import Counter, defaultdict\n",
    "from random import sample\n",
    "from shutil import copyfile\n",
    "from multiprocessing.pool import ThreadPool\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from tqdm import tqdm\n",
    "\n",
    "import path_utils, sas_blob_utils  # ai4eutils\n",
    "\n",
    "from data_management.megadb.schema import sequences_schema_check\n",
    "from data_management.megadb.converters.cct_to_megadb import process_sequences"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "vertical-password",
   "metadata": {},
   "source": [
    "# idfg_swwlf_2020\n",
    "\n",
    "In particular, we want to label sequences with low MDv4 detection confidence."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "interesting-aging",
   "metadata": {},
   "outputs": [],
   "source": [
    "dataset_name = 'idfg_swwlf_2020'\n",
    "\n",
    "container_root = '/mink_disk_0/camtraps/idfg/'\n",
    "path_prefix = 'SWWLF2020/'\n",
    "\n",
    "path_to_output = f'/home/mink/camtraps/data/megadb_jsons/{dataset_name}.json' \n",
    "path_to_output_temp = f'/home/mink/camtraps/data/megadb_jsons/{dataset_name}_temp.json' "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "separated-stevens",
   "metadata": {},
   "outputs": [],
   "source": [
    "all_csv_path = '/mink_disk_0/camtraps/megadetectorv5_annotation_prep/annotation_prep/idfg_SWWLF2020_all.csv'\n",
    "\n",
    "with open(all_csv_path) as f:\n",
    "    all_csv = pd.read_csv(f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "sixth-selling",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "11589732"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(all_csv)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "under-marking",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>image_path</th>\n",
       "      <th>max_confidence</th>\n",
       "      <th>detections</th>\n",
       "      <th>max_conf_animal</th>\n",
       "      <th>max_conf_person</th>\n",
       "      <th>max_conf_group</th>\n",
       "      <th>max_conf_vehicle</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>4172076</th>\n",
       "      <td>SWWLF2020\\R3_mccall_01\\IDFG2307\\SWWLF2020_IDFG...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8914935</th>\n",
       "      <td>SWWLF2020\\R6_02\\IDFG2704\\SWWLF2020_IDFG2704_20...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                image_path  max_confidence  \\\n",
       "4172076  SWWLF2020\\R3_mccall_01\\IDFG2307\\SWWLF2020_IDFG...             0.0   \n",
       "8914935  SWWLF2020\\R6_02\\IDFG2704\\SWWLF2020_IDFG2704_20...             0.0   \n",
       "\n",
       "         detections  max_conf_animal  max_conf_person  max_conf_group  \\\n",
       "4172076         NaN              NaN              NaN             NaN   \n",
       "8914935         NaN              NaN              NaN             NaN   \n",
       "\n",
       "         max_conf_vehicle  \n",
       "4172076               NaN  \n",
       "8914935               NaN  "
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "all_csv.sample(2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "vietnamese-drain",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'SWWLF2020\\\\R1_01\\\\IDFG1476\\\\SWWLF2020_IDFG1476_20200705_140000_TL_0b.JPG'"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "all_csv.iloc[918957]['image_path']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "id": "thrown-efficiency",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0\n",
      "1000000\n",
      "2000000\n",
      "3000000\n",
      "4000000\n",
      "5000000\n",
      "6000000\n",
      "7000000\n",
      "8000000\n",
      "9000000\n",
      "10000000\n",
      "11000000\n"
     ]
    }
   ],
   "source": [
    "seq_dict = defaultdict(list)\n",
    "\n",
    "seq_count = 0\n",
    "cur_frame_num = 1\n",
    "\n",
    "for i_row, row in all_csv.iterrows():\n",
    "    \n",
    "    if i_row % 1000000 == 0:\n",
    "        print(i_row)\n",
    "    \n",
    "    im_path = row['image_path'].replace('\\\\', '/').split(path_prefix)[1]\n",
    "    b_parts = os.path.basename(im_path).split('_')\n",
    "    \n",
    "    frame = b_parts[-1].split('.')[0]\n",
    "    if frame.endswith('b'):\n",
    "        continue;  # skip the \"b\" images - they seem to be very close to the previous frame\n",
    "    else:\n",
    "        frame_num = int(frame)\n",
    "    \n",
    "    if frame_num < cur_frame_num:\n",
    "        # new seq\n",
    "        seq_count += 1\n",
    "\n",
    "    seq_dict[seq_count].append({\n",
    "        'file': im_path,\n",
    "        'frame_num': frame_num,\n",
    "        'datetime': b_parts[2],\n",
    "        'seq_id': seq_count,\n",
    "        'location': '_'.join(b_parts[:2]),\n",
    "        'max_conf': row['max_confidence']\n",
    "    })\n",
    "    \n",
    "    cur_frame_num = frame_num"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "id": "ongoing-electron",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1428238"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(seq_dict)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "id": "primary-perry",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "11013726"
      ]
     },
     "execution_count": 35,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "total_im = 0\n",
    "for seq in seq_dict.values():\n",
    "    total_im += len(seq)\n",
    "total_im"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "first-thursday",
   "metadata": {},
   "source": [
    "### CSV with false negatives"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "stunning-asian",
   "metadata": {},
   "outputs": [],
   "source": [
    "csv_path = '/mink_disk_0/camtraps/megadetectorv5_annotation_prep/annotation_prep/swwlf2020_false_neg_animal.csv'\n",
    "\n",
    "with open(csv_path) as f:\n",
    "    false_negatives_df = pd.read_csv(f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "prospective-circular",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "8294"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Unnamed: 0</th>\n",
       "      <th>File</th>\n",
       "      <th>Human</th>\n",
       "      <th>Wildlife</th>\n",
       "      <th>Livestock</th>\n",
       "      <th>Pet_pack_horse</th>\n",
       "      <th>Vehicle</th>\n",
       "      <th>Empty</th>\n",
       "      <th>YoungPresent</th>\n",
       "      <th>Species</th>\n",
       "      <th>Count</th>\n",
       "      <th>max_conf_animal</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>3022</th>\n",
       "      <td>3023</td>\n",
       "      <td>SWWLF2020_IDFG0943_20200810_110742_MD_2.JPG</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>human</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.291</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      Unnamed: 0                                         File  Human  \\\n",
       "3022        3023  SWWLF2020_IDFG0943_20200810_110742_MD_2.JPG   True   \n",
       "\n",
       "      Wildlife  Livestock  Pet_pack_horse  Vehicle  Empty  YoungPresent  \\\n",
       "3022     False      False            True    False  False         False   \n",
       "\n",
       "     Species  Count  max_conf_animal  \n",
       "3022   human    0.0            0.291  "
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(false_negatives_df)\n",
    "false_negatives_df.sample(1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "wireless-amount",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array(['wolf', 'turkey', 'bobcat', 'dog_domestic', 'human',\n",
       "       'whitetaileddeer', 'cattle_cow', 'elk', 'moose', 'bear_black',\n",
       "       'bird', 'squirrel', 'rabbit_hare', 'other_comments', 'horse',\n",
       "       'muledeer', 'deer_speciesunknown', 'unknown', 'fox_red',\n",
       "       'ground_squirrel', 'none', 'badger', 'rodent', 'coyote',\n",
       "       'goat_domestic', 'mountain_lion', 'cat_domestic', 'skunk_striped',\n",
       "       'llama', 'sheep_domestic', 'marmot', 'goat_mountain', 'pronghorn',\n",
       "       'porcupine'], dtype=object)"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "false_negatives_df['Species'].unique()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "civil-express",
   "metadata": {},
   "source": [
    "We want to identify the sequences from which these false negative images are from, and check if the sequence overall is low-confidence.\n",
    "\n",
    "If it's a time-triggered image, we should get it annotated."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "ordinary-counter",
   "metadata": {},
   "outputs": [],
   "source": [
    "false_negs_species = {}\n",
    "false_negs_count = {}\n",
    "\n",
    "for i_row, row in false_negatives_df.iterrows():\n",
    "    false_negs_species[row['File']] = row['Species']\n",
    "    false_negs_count[row['File']] = int(row['Count']) if not np.isnan(row['Count']) else 0"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "intensive-wings",
   "metadata": {},
   "source": [
    "### Filter to sequences with species label we get from the false negatives list"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "id": "confirmed-provider",
   "metadata": {},
   "outputs": [],
   "source": [
    "from copy import deepcopy"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "id": "basic-decrease",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 1428239/1428239 [00:15<00:00, 89847.53it/s]\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "3847"
      ]
     },
     "execution_count": 50,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sequences = []\n",
    "num_images = 0\n",
    "\n",
    "for seq_id, seq in tqdm(seq_dict.items()):\n",
    "    if len(seq) == 0:\n",
    "        continue\n",
    "    \n",
    "    species = None\n",
    "    location = seq[0]['location']\n",
    "    date = seq[0]['datetime']  # only a date, no timestamp\n",
    "    \n",
    "    for im in seq:\n",
    "        basename = os.path.basename(im['file']) \n",
    "        if basename in false_negs_species:\n",
    "            species = false_negs_species[basename]\n",
    "            count = false_negs_count[basename]\n",
    "    \n",
    "    if species is None:  # no label, exclude\n",
    "        continue\n",
    "        \n",
    "    images = deepcopy(seq)\n",
    "    \n",
    "    for im in images:\n",
    "        del im['seq_id'],\n",
    "        del im['location']\n",
    "        \n",
    "    sequences.append({\n",
    "        'dataset': dataset_name,\n",
    "        'seq_id': f'md_missed_{seq_id}',\n",
    "        'location': location,\n",
    "        'class': [species],\n",
    "        'datetime': date,\n",
    "        'images': images\n",
    "    })\n",
    "    num_images += len(images)\n",
    "\n",
    "    \n",
    "len(sequences) "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "id": "failing-shuttle",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "216572"
      ]
     },
     "execution_count": 51,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "num_images"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "id": "strange-month",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[{'dataset': 'idfg_swwlf_2020',\n",
       "  'seq_id': 'md_missed_219772',\n",
       "  'location': 'SWWLF2020_IDFG1763',\n",
       "  'class': ['elk'],\n",
       "  'datetime': '20200823',\n",
       "  'images': [{'file': 'R2_02/IDFG1763/SWWLF2020_IDFG1763_20200823_203309_MD_1.JPG',\n",
       "    'frame_num': 1,\n",
       "    'datetime': '20200823',\n",
       "    'max_conf': 0.7709999999999999},\n",
       "   {'file': 'R2_02/IDFG1763/SWWLF2020_IDFG1763_20200823_203309_MD_2.JPG',\n",
       "    'frame_num': 2,\n",
       "    'datetime': '20200823',\n",
       "    'max_conf': 0.212},\n",
       "   {'file': 'R2_02/IDFG1763/SWWLF2020_IDFG1763_20200823_203310_MD_3.JPG',\n",
       "    'frame_num': 3,\n",
       "    'datetime': '20200823',\n",
       "    'max_conf': 0.0}]}]"
      ]
     },
     "execution_count": 57,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sample(sequences, 1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "id": "similar-auction",
   "metadata": {},
   "outputs": [],
   "source": [
    "# some frame numbers are not unique within the sequence; drop such frames\n",
    "num_dropped = 0\n",
    "\n",
    "for seq in sequences:\n",
    "    frame_numbers = []\n",
    "    frames = []\n",
    "    \n",
    "    for im in seq['images']:\n",
    "        if im['frame_num'] in frame_numbers:\n",
    "            num_dropped += 1\n",
    "            continue\n",
    "        else:\n",
    "            frame_numbers.append(im['frame_num'])\n",
    "        frames.append(im)\n",
    "    seq['images'] = frames"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 85,
   "id": "referenced-diploma",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Verified that the sequence items meet requirements not captured by the schema.\n",
      "Verified that the sequence items conform to the schema.\n",
      "CPU times: user 853 ms, sys: 0 ns, total: 853 ms\n",
      "Wall time: 852 ms\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "sequences_schema_check.sequences_schema_check(sequences)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 86,
   "id": "relevant-vegetarian",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "3847"
      ]
     },
     "execution_count": 86,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(sequences)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 87,
   "id": "heavy-vault",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "12559"
      ]
     },
     "execution_count": 87,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "num_images = sum([len(seq['images']) for seq in sequences])\n",
    "num_images"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 88,
   "id": "impressive-federation",
   "metadata": {},
   "outputs": [],
   "source": [
    "locations = Counter([seq['location'] for seq in sequences])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 89,
   "id": "determined-divorce",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "517"
      ]
     },
     "execution_count": 89,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(locations)  # most only have one or two sequences at the location"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 66,
   "id": "blocked-physics",
   "metadata": {},
   "outputs": [],
   "source": [
    "with open(path_to_output_temp, 'w', encoding='utf-8') as f:\n",
    "    json.dump(sequences, f, indent=1, ensure_ascii=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 103,
   "id": "collective-difficulty",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'dataset': 'idfg_swwlf_2020', 'seq_id': 'md_missed_6234', 'location': 'SWWLF2020_IDFG0808', 'class': ['human'], 'datetime': '20200801', 'images': [{'file': 'R1_01/IDFG0808/SWWLF2020_IDFG0808_20200801_161954_MD_1.JPG', 'frame_num': 1, 'datetime': '20200801', 'max_conf': 1.0}, {'file': 'R1_01/IDFG0808/SWWLF2020_IDFG0808_20200801_161956_MD_2.JPG', 'frame_num': 2, 'datetime': '20200801', 'max_conf': 1.0}, {'file': 'R1_01/IDFG0808/SWWLF2020_IDFG0808_20200801_161956_MD_3.JPG', 'frame_num': 3, 'datetime': '20200801', 'max_conf': 0.888}]}\n",
      "{'file': 'R1_01/IDFG0808/SWWLF2020_IDFG0808_20200801_161954_MD_1.JPG', 'frame_num': 1, 'datetime': '20200801', 'max_conf': 1.0}\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "12559"
      ]
     },
     "execution_count": 103,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "im_to_send = []\n",
    "for seq in sequences:\n",
    "    for im in seq['images']:\n",
    "        if im['file'] == 'R1_01/IDFG0808/SWWLF2020_IDFG0808_20200801_161954_MD_1.JPG':\n",
    "            print(seq)\n",
    "        \n",
    "        im_to_send.append(im)\n",
    "        if im['file'] == 'R1_01/IDFG0808/SWWLF2020_IDFG0808_20200801_161954_MD_1.JPG':\n",
    "            print(im_to_send[-1])\n",
    "    # im_to_send.extend(seq['images'])\n",
    "len(im_to_send)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 105,
   "id": "continuing-tuner",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 105,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "'SWWLF2020/R1_02/IDFG1501/SWWLF2020_IDFG1501_20200704_151530_MD_1.JPG\\n' in list_to_download"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 106,
   "id": "contemporary-republic",
   "metadata": {},
   "outputs": [],
   "source": [
    "list_to_download = [os.path.join(path_prefix, im['file']) + '\\n' for im in im_to_send]\n",
    "\n",
    "with open(f'/mink_disk_0/camtraps/megadetectorv5_annotation_prep/batch_12_lists/{dataset_name}_files.txt', 'w') as f:\n",
    "    f.writelines(list_to_download)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 107,
   "id": "expressed-electron",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'SWWLF2020/R7_02/IDFG2749/SWWLF2020_IDFG2749_20200902_195724_MD_3.JPG\\n'"
      ]
     },
     "execution_count": 107,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "list_to_download[-50]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "concerned-strengthening",
   "metadata": {},
   "source": [
    "## Rename and copy to imerit12g folder"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 108,
   "id": "provincial-generic",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 83.8 ms, sys: 32.5 ms, total: 116 ms\n",
      "Wall time: 115 ms\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "('/mink_disk_0/camtraps/idfg/SWWLF2020/R7_02/IDFG2749/SWWLF2020_IDFG2749_20200902_195724_MD_3.JPG',\n",
       " '/mink_disk_0/camtraps/imerit12g/idfg_swwlf_2020.seqmd_missed_1410386.frame3.jpg')"
      ]
     },
     "execution_count": 108,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "path_pairs = []\n",
    "\n",
    "not_there = []\n",
    "\n",
    "for seq in sequences:\n",
    "    seq_id = seq['seq_id']\n",
    "    for im in seq['images']:\n",
    "        src_path = os.path.join(container_root, path_prefix, im['file'])\n",
    "        if not os.path.exists(src_path):\n",
    "            not_there.append((im, seq_id))\n",
    "        \n",
    "        frame = im['frame_num']\n",
    "        dst_path = os.path.join('/mink_disk_0/camtraps/imerit12g', \n",
    "                                f'{dataset_name}.seq{seq_id}.frame{frame}.jpg')\n",
    "        path_pairs.append((src_path, dst_path))\n",
    "\n",
    "path_pairs[-50]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 109,
   "id": "sharing-birthday",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'file': 'R7_02/IDFG2749/SWWLF2020_IDFG2749_20200902_195724_MD_3.JPG',\n",
       " 'frame_num': 3,\n",
       " 'datetime': '20200902',\n",
       " 'max_conf': 1.0}"
      ]
     },
     "execution_count": 109,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "im_to_send[-50]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 110,
   "id": "durable-uruguay",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "12559"
      ]
     },
     "execution_count": 110,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(path_pairs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 112,
   "id": "classical-placement",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "315"
      ]
     },
     "execution_count": 112,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(not_there)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 113,
   "id": "charged-fiber",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 29.4 s, sys: 37.1 s, total: 1min 6s\n",
      "Wall time: 30.3 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "def copy_file(src_path, dst_path):\n",
    "    if os.path.exists(src_path):\n",
    "        return copyfile(src_path, dst_path)\n",
    "\n",
    "with ThreadPool(16) as pool:\n",
    "    dst_paths = pool.starmap(copy_file, path_pairs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "upper-namibia",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python [conda env:cameratraps] *",
   "language": "python",
   "name": "conda-env-cameratraps-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
